Testing primers against PR2

1 Goal

  • Testing primers against the PR2 database (latest version 4.11.1)

2 Notes about script

  • Chunks compute_matches and store_matches have only to be run in 2 cases
    • new version of pr2
    • new set of primers
    • In the other case, they can be set eval=FALSE

2.1 To do

  • Check why there are some negative amplicon (probably some matches ahead - Put a condition that amplicon should be >)

5 Read primer file

5.2 Build the primer set table 1

# Read from local database
pr2_db <- db_info("pr2_local")
pr2_db_con <- db_connect(pr2_db)
primers <- tbl(pr2_db_con, "pr2_primers") %>% collect()
primer_sets_all <- tbl(pr2_db_con, "pr2_primer_sets") %>% collect()
disconnect <- db_disconnect(pr2_db_con)

if (gene_selected == "18S_rRNA") {
    gene_regions = c("V4", "V9")
    # Just keep the selected primers (V4, V9 etc..) - Remove unpublished primers
    primer_sets <- primer_sets_all %>% filter(gene == gene_selected) %>% filter(gene_region %in% 
        gene_regions) %>% filter(!(primer_set_id %in% c(12, 22, 26, 63:66, 77, 83)))
} else {
    primer_sets <- primer_sets %>% filter(specificity == "plastid" | (gene == "18S_rRNA" & 
        specificity == "universal"))
}

primer_sets <- primer_sets %>% left_join(select(primers, primer_id, fwd_name = name, fwd_seq = sequence, 
    fwd_start_yeast = start_yeast, fwd_end_yeast = end_yeast), by = c(fwd_id = "primer_id")) %>% 
    left_join(select(primers, primer_id, rev_name = name, rev_seq = sequence, rev_start_yeast = start_yeast, 
        rev_end_yeast = end_yeast), by = c(rev_id = "primer_id")) %>% mutate(length_yeast = rev_end_yeast - 
    fwd_start_yeast + 1) %>% select(gene_region, specificity, primer_set_id, primer_set_name, 
    contains("fwd"), contains("rev"), length_yeast, used_in:remark) %>% select(-fwd_id, -rev_id) %>% 
    arrange(gene_region, fwd_start_yeast)

# write_tsv(primer_sets, path = 'output/primers_Table_1.tsv', na = '')
primer_sets_non_specific <- primer_sets %>% filter(is.na(specificity))

primer_sets_specific <- primer_sets %>% filter(!(is.na(specificity)))

knitr::kable(select(primer_sets, primer_set_id:primer_set_name, fwd_name, rev_name, length_yeast)) %>% 
    kableExtra::kable_styling()
primer_set_id primer_set_name fwd_name rev_name length_yeast
40 Zhan Uni18SF Uni18SR 461
13 Brate1 3NDf V4_euk_R1 465
14 Brate2 3NDf V4_euk_R2 458
18 Parfrey 515F 1119r 598
33 Needham 515F Univ 926R 589
34 Lambert 515FY NSR951 391
35 UNonMet EUK581-F EUK1134-R 578
4 Hugerth_2 563f 1132r 587
7 Bass V4_1f TAReukREV3 417
8 Stoeck_V4_2 TAReuk454FWD1 TAReukREV3 417
16 Piredda_V4 TAReuk454FWD1 V4 18S Next.Rev 417
19 Vannini Claudia Vannini (F) Claudia Vannini (R) 439
36 Stoeck_V4_1 TAReuk454FWD1 V4RB 417
1 Hadziavdic_1 F-566 R-1200 635
15 Moreno EUKAF EUKAR 410
17 Comeau E572F E1009R 438
25 Mangot NSF563 NSR951 380
2 Hadziavdic_2 A-528F R-952 379
39 Egge A-528F PRYM01+7 396
3 Hugerth_1 574*f 1132r 576
23 Venter 590F 1300R 720
21 Zimmerman D512for D978rev 437
41 Harder Cerc479F Cerc750R 283
24 Simon EK-565F-NGS EUK1134-R 527
5 Hugerth_3 616f 1132r 534
6 Hugerth_4 616*f 1132r 534
28 Amaral_1 1380F 1510R 176
29 Amaral_2 1389F 1510R 167
31 Piredda_V9 1388F 1510R 167
27 Stoeck_V9 1391F EukB 169

6 Computing the matches

This part is done by an R script PR2 Primers pr2_match.R that is executed in batch mode.

6.1 Programing Notes

  • Use Biostrings

Accessor methods : In the code snippets below, x is an MIndex object.

  • length(x): The number of patterns that matches are stored for.
  • names(x): The names of the patterns that matches are stored for.
  • startIndex(x): A list containing the starting positions of the matches for each pattern.
  • endIndex(x): A list containing the ending positions of the matches for each pattern.
  • elementNROWS(x): An integer vector containing the number of matches for each pattern.
  • x[[i]]: Extract the matches for the i-th pattern as an IRanges object.
  • unlist(x, recursive=TRUE, use.names=TRUE): Return all the matches in a single IRanges object. recursive and use.names are ignored.
  • extractAllMatches(subject, mindex): Return all the matches in a single XStringViews object.

One could also use another function which does not give the position * match_fwd <- vcountPattern(fwd, seq,max.mismatch=0, min.mismatch=0, with.indels=FALSE, fixed=FALSE, algorithm=“auto”)

8 Summarize the data in tables

9 Amplicon length

9.2 Plot an example of amplicon distribution (Fig. 3)

fig3A <- list()
fig3B <- list()


for (one_primer_set in c(4, 29)) {
    
    if (one_primer_set == 4) {
        xmin = 570
        xmax = 610
        xmax2 = 2000
    } else {
        xmin = 130
        xmax = 190
        xmax2 = 1000
        
    }
    
    pr2_match_final_one <- pr2_match_final %>% filter(primer_set_id == one_primer_set) %>% 
        filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X", "Protoalveoalata")))
    
    primer_label <- str_replace_all(pr2_match_final_one$primer_label[1], "_", " ")
    
    g <- ggplot(pr2_match_final_one, aes(x = ampli_size)) + geom_density(fill = "blue", alpha = 0.9) + 
        xlab("Amplicon size") + ggtitle(str_c("Primer set - ", primer_label)) + xlim(xmin, 
        xmax)
    
    print(g)
    
    fig3A[[one_primer_set]] <- ggplot(pr2_match_final_one, aes(x = ampli_size, fill = supergroup)) + 
        geom_density(alpha = 0.9) + theme_bw() + theme(legend.position = "none") + scale_fill_viridis_d(option = "inferno") + 
        xlab("Amplicon size (bp)") + ylab("Density") + ggtitle(str_c("Primer set - ", primer_label)) + 
        xlim(xmin, xmax)
    
    print(fig3A[[one_primer_set]])
    
    fig3B[[one_primer_set]] <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + 
        geom_boxplot(outlier.alpha = 0.3) + theme_bw() + coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Supergroup") + ylab("Amplicon size (bp)") + ylim(0, xmax2)
    
    print(fig3B[[one_primer_set]])
    
    g <- ggplot(pr2_match_final_one, aes(x = supergroup, y = ampli_size)) + geom_violin() + 
        coord_flip() + # theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("Supergroup") + ylab("Amplicon size (bp)")
    print(g)
    
}

10 Graphics

10.1 All Eukaryotes

Comments

  • Percent of sequences amplified
    • Logically, the % of seq amplified is always < min(% of sequences matching forwar, % of sequences matching reverse)
    • In general it is the reverse primer that causes problems.
    • Some primer sets do not amplify any sequence (11, 19, 20, 21)
    • For V9, primer set 30 reverse primer is in the ITS region which is not present in the PR2 sequences, so no amplification.
  • Amplicon sizes
    • Only 8 V4 primer sets are suitable for Illumina 2x250 (max amplicon size = 450 bp)
    • This is if we consider the mean… If we consider the variation around the mean then, only 3 suitable for 2x250
    • Five more V4 primer sets are suitable for Illumina 3x250 (max amplicon size = 550 bp)
fig1 <- list()

for (one_region in gene_regions) {
    
    pr2_match_summary_primer_set_region_long <- pr2_match_summary_primer_set_long %>% filter(gene_region == 
        one_region) %>% # Remove the group specific primers
    filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
    
    pr2_match_summary_primer_set_region <- pr2_match_summary_primer_set %>% filter(gene_region == 
        one_region) %>% # Remove the group specific primers)
    filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
    
    pr2_match_region <- pr2_match_final %>% filter(gene_region == one_region) %>% # Remove the group specific primers
    filter(!(primer_set_id %in% c(35, 19, 39, 21, 41)))
    
    g <- ggplot(pr2_match_summary_primer_set_region_long) + geom_col(aes(x = reorder(primer_label, 
        -primer_set_id), y = pct_seq, fill = fct_reorder(pct_category, pct_category_order)), 
        width = 0.7, position = "dodge") + theme_bw() + xlab("Primer set") + ylab("% of sequences amplified") + 
        scale_fill_manual(name = "", values = c(pct_amplicons = "black", pct_fwd = "grey80", 
            pct_rev = "grey40"), labels = c("Amplicons", "Primer rev", "Primer fwd")) + ggtitle(str_c(one_region, 
        " - Percentage of sequences recovered")) + theme(axis.text.x = element_text(angle = 0, 
        hjust = 1)) + ylim(0, 100) + coord_flip() + guides(fill = guide_legend(nrow = 1)) + 
        theme(legend.position = "top", legend.box = "horizontal")
    
    print(g)
    
    fig1[[str_c(one_region, "pct", sep = " ")]] <- g
    
    
    g <- ggplot(filter(pr2_match_summary_primer_set_region, !is.nan(ampli_size_mean))) + geom_point(aes(x = reorder(primer_label, 
        -primer_set_id), y = ampli_size_mean), colour = "black") + geom_errorbar(aes(x = reorder(primer_label, 
        primer_set_id), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + 
        theme_bw() + xlab("Primer set") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
    ggtitle(str_c(one_region, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) + 
        geom_hline(yintercept = c(450, 550), linetype = c(2, 3)) + ylim(0, 850) + theme(axis.text.x = element_text(angle = 0, 
        hjust = 1)) + coord_flip()
    
    print(g)
    
    fig1[[str_c(one_region, "size", sep = " ")]] <- g
    
    # g <- ggplot(pr2_match_region) + geom_boxplot(aes(x=reorder(primer_label, primer_set_id),
    # y=ampli_size), colour='black', outlier.alpha = 0.3) + theme_bw() + xlab('Primer set') +
    # ylab('Amplicon size (bp)') + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500))
    # + ggtitle (str_c(one_region, ' - Amplicon size - Lines correspond to limits for Illumina
    # 2x250 and 2x300 respectively') ) + geom_hline(yintercept = c(450,550), linetype = c(2,3)
    # )+ theme(axis.text.x = element_text(angle = 45, hjust = 1)) print(g)
}

10.2 By supergroup

Comments

  • Excavata have a very different patterns from the rest of the group. They are not amplified by quite a few primer sets. They have also bigger amplicons
  • Some groups exhibit higher variability in amplicon size (e.g Chlorophyta)
for (specific_set in c(FALSE, TRUE)) {
    
    pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 
        20)) %>% filter(!(supergroup %in% c("Apusozoa", "Eukaryota_X")))
    nrow = 5
    
    if (specific_set) {
        pr2_match_summary_primer_set_sg_region <- pr2_match_summary_primer_set_sg %>% filter((n_seq > 
            20)) %>% filter((primer_set_id %in% c(6, 13, 34, 25, 33, 16))) %>% filter(!(supergroup %in% 
            c("Apusozoa", "Eukaryota_X")))
        nrow = 1
    }
    
    
    g <- ggplot(pr2_match_summary_primer_set_sg_region) + geom_col(aes(x = supergroup, y = pct_amplicons, 
        fill = supergroup), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + 
        xlab("Supergroup") + # ggtitle ('% amplified per Supergroup' ) +
    scale_fill_viridis_d(option = "inferno") + ylim(0, 100) + facet_wrap(~primer_label, scales = "fixed", 
        nrow = nrow) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", 
        legend.box = "horizontal") + theme(legend.position = "none")
    
    print(g)
    
    list_label <- str_c("pct", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
    
    fig_supergroup[[list_label]] <- g
    
    
    
    g <- ggplot(filter(pr2_match_summary_primer_set_sg_region, !is.nan(ampli_size_mean))) + 
        geom_point(aes(x = supergroup, y = ampli_size_mean), colour = "black") + theme_bw() + 
        coord_flip() + geom_errorbar(aes(x = supergroup, ymax = ampli_size_mean + ampli_size_sd, 
        ymin = ampli_size_mean - ampli_size_sd)) + xlab("Supergroup") + ylab("Amplicon size (bp)") + 
        # ggtitle (str_c(' - Amplicon size - Lines correspond to limits for Illumina 2x250 and
    # 2x300 respectively') ) +
    geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label, scales = "fixed", 
        nrow = nrow)
    
    print(g)
    
    list_label <- str_c("size", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
    
    fig_supergroup[[list_label]] <- g
    
}

10.3 By class for autotrophs

fig_class <- list()

for (specific_set in c(FALSE, TRUE)) {
    
    pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & 
        (division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta")))
    nrow = 5
    
    if (specific_set) {
        pr2_match_summary_filtered <- filter(pr2_match_summary_primer_set_class, (n_seq > 20) & 
            (division %in% c("Haptophyta", "Dinoflagellata", "Chlorophyta", "Ochrophyta", "Cryptophyta"))) %>% 
            filter((primer_set_id %in% c(16, 21)))
        nrow = 1
    }
    
    if (nrow(pr2_match_summary_filtered) > 0) {
        
        g <- ggplot(pr2_match_summary_filtered) + geom_col(data = pr2_match_summary_filtered, 
            aes(x = str_c(str_trunc(division, 20, ellipsis = ""), "-", class), y = pct_amplicons, 
                fill = class), position = "dodge") + theme_bw() + coord_flip() + ylab("% of sequences amplified") + 
            xlab("Class") + scale_fill_viridis_d(option = "inferno") + ylim(0, 100) + facet_wrap(~primer_label, 
            scales = "fixed", nrow = nrow) + guides(fill = guide_legend(nrow = 1)) + theme(legend.position = "top", 
            legend.box = "horizontal") + theme(legend.position = "none")
        
        print(g)
        
        list_label <- str_c("pct", case_when(specific_set ~ "specific", TRUE ~ ""), sep = " ")
        
        fig_class[[list_label]] <- g
        
        g <- ggplot(filter(pr2_match_summary_filtered, !is.nan(ampli_size_mean))) + geom_point(aes(x = str_c(str_trunc(division, 
            3, ellipsis = ""), "-", class), y = ampli_size_mean), colour = "black") + theme_bw() + 
            coord_flip() + geom_errorbar(aes(x = str_c(str_trunc(division, 3, ellipsis = ""), 
            "-", class), ymax = ampli_size_mean + ampli_size_sd, ymin = ampli_size_mean - ampli_size_sd)) + 
            xlab("Class") + ylab("Amplicon size (bp)") + # scale_y_continuous(breaks = (1:8)*200, limits = c(0,1500)) +
        ggtitle(str_c("Set -", one_primer_set, " - Amplicon size - Lines correspond to limits for Illumina 2x250 and 2x300 respectively")) + 
            geom_hline(yintercept = c(450, 550), linetype = 2) + facet_wrap(~primer_label, 
            scales = "fixed", nrow = nrow)
        
        print(g)
    }
}

13 Figures

Daniel Vaulot

24 10 2019